/*
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 * 
 * Started from
 * https://svn.apache.org/repos/asf/stanbol/trunk/enhancement-engines/htmlextractor/src/
 * main/java/org/apache/stanbol/enhancer/engines/htmlextractor/impl/DOMBuilder.java
 */
package com.digitalpebble.storm.crawler.parse;

import java.util.HashMap;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.html.dom.HTMLDocumentImpl;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;

/**
 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
 */
public final class JSoupDOMBuilder {

    /**
     * Restrict instantiation
     */
    private JSoupDOMBuilder() {
    }

    /**
     * Returns a W3C DOM that exposes the same content as the supplied Jsoup
     * document into a W3C DOM.
     * 
     * @param jsoupDocument
     *            The Jsoup document to convert.
     * @return A W3C Document.
     */
    public static Document jsoup2DOM(org.jsoup.nodes.Document jsoupDocument) {

        Document document = null;

        try {

            /* Obtain the document builder for the configured XML parser. */
            DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory
                    .newInstance();
            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();

            /* Create a document to contain the content. */
            document = docBuilder.newDocument();
            createDOM(jsoupDocument, document, document,
                    new HashMap<String, String>());

        } catch (ParserConfigurationException pce) {
            throw new RuntimeException(pce);
        }

        return document;
    }

    public static DocumentFragment jsoup2HTML(
            org.jsoup.nodes.Document jsoupDocument) {
        HTMLDocumentImpl htmlDoc = new HTMLDocumentImpl();
        htmlDoc.setErrorChecking(false);
        DocumentFragment fragment = htmlDoc.createDocumentFragment();
        createDOM(jsoupDocument, fragment, htmlDoc,
                new HashMap<String, String>());
        return fragment;
    }

    /**
     * The internal helper that copies content from the specified Jsoup
     * <tt>Node</tt> into a W3C {@link Node}.
     * 
     * @param node
     *            The Jsoup node containing the content to copy to the specified
     *            W3C {@link Node}.
     * @param out
     *            The W3C {@link Node} that receives the DOM content.
     */
    public static void createDOM(org.jsoup.nodes.Node node, Node out,
            Document doc, Map<String, String> ns) {

        if (node instanceof org.jsoup.nodes.Document) {

            org.jsoup.nodes.Document d = ((org.jsoup.nodes.Document) node);
            for (org.jsoup.nodes.Node n : d.childNodes()) {
                createDOM(n, out, doc, ns);
            }

        } else if (node instanceof org.jsoup.nodes.Element) {

            org.jsoup.nodes.Element e = ((org.jsoup.nodes.Element) node);
            org.w3c.dom.Element _e = doc.createElement(e.tagName());
            out.appendChild(_e);
            org.jsoup.nodes.Attributes atts = e.attributes();

            for (org.jsoup.nodes.Attribute a : atts) {
                String attName = a.getKey();
                // omit xhtml namespace
                if (attName.equals("xmlns")) {
                    continue;
                }
                String attPrefix = getNSPrefix(attName);
                if (attPrefix != null) {
                    if (attPrefix.equals("xmlns")) {
                        ns.put(getLocalName(attName), a.getValue());
                    } else if (!attPrefix.equals("xml")) {
                        String namespace = ns.get(attPrefix);
                        if (namespace == null) {
                            // fix attribute names looking like qnames
                            attName = attName.replace(':', '_');
                        }
                    }
                }
                _e.setAttribute(attName, a.getValue());
            }

            for (org.jsoup.nodes.Node n : e.childNodes()) {
                createDOM(n, _e, doc, ns);
            }

        } else if (node instanceof org.jsoup.nodes.TextNode) {

            org.jsoup.nodes.TextNode t = ((org.jsoup.nodes.TextNode) node);
            if (!(out instanceof Document)) {
                out.appendChild(doc.createTextNode(t.text()));
            }
        }
    }

    // some hacks for handling namespace in jsoup2DOM conversion
    private static String getNSPrefix(String name) {
        if (name != null) {
            int pos = name.indexOf(':');
            if (pos > 0) {
                return name.substring(0, pos);
            }
        }
        return null;
    }

    private static String getLocalName(String name) {
        if (name != null) {
            int pos = name.lastIndexOf(':');
            if (pos > 0) {
                return name.substring(pos + 1);
            }
        }
        return name;
    }
}
